##########################################
# Replication Data for Proksch, Lowe, Wäckerle, Soroka. (2018). Multilingual Sentiment Analysis: A New Approach to Measuring Conflict in Legislative Speeches. Legislative Studies Quarterly, Forthcoming.
##########################################

#Part 3: Wordscores Replication
#Most of this code follows the replication provided by Herzog and Benoit for the article "The Most Unkindest Cuts: Speaker Selection and Expressed Government Dissent During Economic Crisis"
# The code is split into several parts because the original code only runs with an old version of quanteda, which has to be installed first. We then need to switch to a more recent version for the sentiment analysis.

# Start from an empty workspace (kept from the original replication script).
rm(list = ls(all = TRUE))
library(rstudioapi)

# Make the folder containing this script the working directory so that the
# relative data paths below resolve. This only works inside RStudio with a
# saved document; otherwise the current working directory is left untouched
# instead of aborting the script.
current_path <- if (rstudioapi::isAvailable()) {
  getActiveDocumentContext()$path
} else {
  ""
}
if (nzchar(current_path)) {
  setwd(dirname(current_path))
}

# devtools provides install_version(); fail immediately if it is missing.
library(devtools)

# Swap whatever quanteda is present for the old release the wordscores code
# needs. detach() errors when the package is not attached and
# remove.packages() errors when it is not installed, so both are guarded.
if ("package:quanteda" %in% search()) {
  detach("package:quanteda", unload = TRUE)
}
if ("quanteda" %in% rownames(installed.packages())) {
  remove.packages("quanteda")
}
install_version("quanteda", version = "0.8.0-3", repos = "http://cran.us.r-project.org") # need old quanteda version to replicate findings
library(quanteda)

# Not referenced in the code of this part; kept as soft dependencies so a
# missing installation does not stop the run.
require(tidyverse)
require(reshape2)

# Load the speech table (tab-delimited); strings stay character vectors
d <- read.delim(file = "Data_budget_debates_1983-2012.tab",
                stringsAsFactors = FALSE)
# Rename the "year" column to "debate_year"
names(d) <- replace(names(d), names(d) == "year", "debate_year")

# Build the corpus from the speech texts
budgetCorpus <- corpus(
  as.character(d[, "speech"]),
  source = "budget_debates_1983-2012.tab from an SQL dump",
  notes = "From Alex Herzog's database of Irish speech data."
)
# Every column except the speech text becomes a document variable
docvars(budgetCorpus) <- d[, names(d) != "speech"]

# Derive budget_year for every speech from its debate year and month.
# Speeches matching none of the rules below keep NA.
debate_year <- d$debate_year
month <- d$month
budget_year <- local({
  assigned <- rep(NA, length(debate_year))

  # 1983-1996: budget debates are held at the beginning of the year,
  # so the budget year equals the debate year
  same_year <- debate_year >= 1983 & debate_year <= 1996
  assigned[same_year] <- debate_year[same_year]

  # 1997: one debate in January/February for the 1997 budget and one in
  # December for the 1998 budget
  in_1997 <- debate_year == 1997
  assigned[in_1997 & month <= 2] <- 1997
  assigned[in_1997 & month == 12] <- 1998

  # 1998-2008 and 2010-2012: debates are held in December for the next year
  next_year <- (debate_year >= 1998 & debate_year <= 2008) |
    (debate_year >= 2010 & debate_year <= 2012)
  assigned[next_year] <- debate_year[next_year] + 1

  # 2009: the April debate on the supplementary budget is coded 0 (a value
  # outside the 1983:2013 range scored further below); the December debate
  # belongs to the 2010 budget
  in_2009 <- debate_year == 2009
  assigned[in_2009 & month == 4] <- 0
  assigned[in_2009 & month == 12] <- 2010

  assigned
})
# cross-check the mapping, including unassigned (NA) speeches
table(budget_year, debate_year, useNA = "ifany")
# store the result on the corpus and drop the temporary vectors
docvars(budgetCorpus, "budget_year") <- budget_year
rm(debate_year, budget_year, month)



### FUNCTIONS ##########################
# Normalise member names: coerce to character and apply a fixed sequence of
# substitutions, each one working on the output of the previous one.
# Returns a character vector of the same length as x.
clean.name <- function(x) {
    steps <- list(
        # drop the first "RIP" (any case)
        function(s) sub("RIP", "", s, ignore.case = TRUE),
        # drop parenthesised parts, e.g. "(Cavan)"
        function(s) gsub("\\(.+?\\)", "", s),
        # drop a leading two-character title followed by ". " (e.g. "Mr. ")
        function(s) gsub("^.{2}\\. ", "", s),
        # join the first "Ó " with the word that follows it
        function(s) sub("Ó ", "Ó", s, ignore.case = TRUE),
        # drop leading titles written without a dot
        function(s) gsub("^Mr ", "", s, ignore.case = TRUE),
        function(s) gsub("^Ms ", "", s, ignore.case = TRUE),
        # collapse runs of whitespace into a single space
        function(s) gsub("\\s{2,}", " ", s, perl = TRUE)
    )
    Reduce(function(value, step) step(value), steps, as.character(x))
}

# Retrieve the last name: the final space-separated token of the cleaned name.
# Always returns a character vector with one element per element of x; a name
# that is empty after cleaning yields NA instead of turning the whole result
# into a list (which sapply() did for ragged or zero-length input).
get.lname <- function(x) {
    name_parts <- strsplit(clean.name(x), " ")
    vapply(name_parts,
           function(parts) {
               if (length(parts) == 0L) NA_character_ else parts[length(parts)]
           },
           character(1))
}

# Retrieve the first name: the first space-separated token of the cleaned name.
# Always returns a character vector with one element per element of x (NA for
# a name that is empty after cleaning), including character(0) for zero-length
# input, where sapply() returned an empty list.
get.fname <- function(x) {
    name_parts <- strsplit(clean.name(x), " ")
    vapply(name_parts, function(parts) parts[1], character(1))
}


## flag speakers who were LEAS-CHEANN COMHAIRLE
## (matching uses the raw member names, i.e. before they are cleaned below)
docvars(budgetCorpus, "leas_cheann_comhairle") <- local({
    speaker <- docvars(budgetCorpus, "member_name")
    budget <- docvars(budgetCorpus, "budget_year")
    # office holder -> budget years in which the flag applies
    terms <- list(
        "Mr. John J. Ryan" = 1983:1986,
        "Mr. Jim Tunney" = 1987:1992,
        "Mr. Joe Jacob" = 1993:1997,
        "Dr. Rory O'Hanlon" = 1998:2002,
        "Mr. Séamus Pattison" = 2003:2007,
        "Mr. Brendan Howlin" = 2008:2011,
        "Mr. Michael P. Kitt" = 2012:2013
    )
    # one logical vector per term, combined with elementwise OR
    Reduce(`|`, Map(function(holder, years) speaker == holder & budget %in% years,
                    names(terms), terms))
})
# table(docvars(subset(budgetCorpus, leas_cheann_comhairle==TRUE), "member_name"), docvars(subset(budgetCorpus, leas_cheann_comhairle==TRUE), "budget_year"))

## flag speakers who were CHEANN COMHAIRLE
docvars(budgetCorpus, "cheann_comhairle") <- local({
    speaker <- docvars(budgetCorpus, "member_name")
    budget <- docvars(budgetCorpus, "budget_year")
    # office holder -> budget years in which the flag applies
    terms <- list(
        "Mr. Thomas J. (Cavan) Fitzpatrick" = 1983:1986,
        "Mr. Seán Treacy" = 1987:1997,
        "Mr. Séamus Pattison" = 1998:2002,
        "Dr. Rory O'Hanlon" = 2003:2007,
        "Mr. John O'Donoghue" = 2008:2009,
        "Mr. Séamus Kirk" = 2009:2011,
        "Mr. Sean Barrett" = 2012:2013
    )
    Reduce(`|`, Map(function(holder, years) speaker == holder & budget %in% years,
                    names(terms), terms))
})
# cross-tabulate the flagged speakers by budget year as a visual check
local({
    chairs <- subset(budgetCorpus, cheann_comhairle==TRUE)
    table(docvars(chairs, "member_name"), docvars(chairs, "budget_year"))
})


# Replace member_name with its cleaned form, then derive last_name and
# first_name from the cleaned value stored on the corpus.
budgetCorpus <- local({
    corp <- budgetCorpus
    docvars(corp, "member_name") <- clean.name(docvars(corp, "member_name"))
    tidy_names <- docvars(corp, "member_name")
    docvars(corp, "last_name") <- get.lname(tidy_names)
    docvars(corp, "first_name") <- get.fname(tidy_names)
    corp
})



## external data to add
## A speech is flagged when its (memberID, budget_year) pair occurs in the
## external table; each pair is packed into one number, memberID * 10000 + year.

# finance ministers
fm.data <- read.csv(file = "Data_finance_ministers_1983-2013.csv", header = TRUE)
docvars(budgetCorpus, "finance_minister") <- local({
    speech_key <- docvars(budgetCorpus, "memberID") * 10000 +
        docvars(budgetCorpus, "budget_year")
    office_key <- fm.data$memberID * 10000 + fm.data$year
    speech_key %in% office_key
})
# visual check: flagged finance ministers by budget year
local({
    ministers <- subset(budgetCorpus, finance_minister==TRUE)
    table(docvars(ministers, "member_name"), docvars(ministers, "budget_year"))
})


# opposition spokespersons (shadow finance minister)
os.data <- read.csv(file = "Data_opposition_spokesperson_1983-2013.csv", header = TRUE)
docvars(budgetCorpus, "opposition_spokesperson") <- local({
    speech_key <- docvars(budgetCorpus, "memberID") * 10000 +
        docvars(budgetCorpus, "budget_year")
    office_key <- os.data$memberID * 10000 + os.data$year
    speech_key %in% office_key
})
# visual check: flagged opposition spokespersons by budget year
local({
    spokespersons <- subset(budgetCorpus, opposition_spokesperson==TRUE)
    table(docvars(spokespersons, "member_name"), docvars(spokespersons, "budget_year"))
})


################################################
## create year-speaker dataset to hold results
################################################

# Speaker-level document variables to carry over (selected by name pattern)
varstokeep <- grep("memberID|const|name|party|budget_year|cheann|finance|opposition|position|department",
                   names(docvars(budgetCorpus)))
# one row per distinct combination of the kept variables
yearspeakerData <- unique(docvars(budgetCorpus)[, varstokeep])

# identify the duplicated departments: rows that repeat an already seen
# (memberID, budget_year) pair
inMoreThanOneDept <- which(duplicated(yearspeakerData[, c("memberID", "budget_year")]))
dupls <- yearspeakerData[,
                         c("memberID", "budget_year", "department")]
# split into a list with one data frame per (memberID, budget_year) pair
duplslist <- split(dupls, dupls$memberID*10000 + dupls$budget_year)
# first row of every pair; its department becomes "department2"
multidept <- do.call(rbind, lapply(duplslist, function(l) l[1,]))
# make it "wide" into dept2 dept3 dept4 columns
# (rows 2 and 3 of each pair; NA when the pair has fewer rows)
names(multidept)[which(names(multidept)=="department")] <- "department2"
multidept$department3 <- do.call(rbind, lapply(duplslist, function(l) l[2,]))$department
multidept$department4 <- do.call(rbind, lapply(duplslist, function(l) l[3,]))$department

# remove the duplicated entries
# Guarded: with no duplicates the index is integer(0), and x[-integer(0), ]
# selects zero rows, i.e. it would silently discard the whole data set.
if (length(inMoreThanOneDept) > 0L) {
  yearspeakerData <- yearspeakerData[-inMoreThanOneDept, ]
}
# merge in the extra depts
yearspeakerData <- merge(yearspeakerData, multidept,
                         by=c("memberID", "budget_year"), all.x=TRUE)


#################
## dfm analysis
#################

# Number of speeches per budget year (includes the 0 code assigned above)
table(budgetCorpus$documents$budget_year)
cat("Computing wordscores: \n")
# Accumulator for the per-year results; each iteration appends a data frame
# with the columns budget_year, memberID and textscore.
textResults <- list()
# NOTE: the objects assigned inside the loop (thisdfm, thisrefOpp, thisrefGovt,
# minWordsThreshold, refscores, ws, ts) stay in the workspace with the values
# of the last iteration and are written to disk by the save() call below.
for (yr in 1983:2013) {
  cat("  ...", yr, "\n")
  # Document-feature matrix for this budget year, excluding speeches flagged
  # as leas_cheann_comhairle or cheann_comhairle, grouped by memberID and
  # built without stemming.
  thisdfm <- dfm(subset(budgetCorpus, budget_year==yr & 
                          !leas_cheann_comhairle &
                          !cheann_comhairle),
                 groups="memberID", stem=FALSE, verbose=FALSE)
  # Reference speakers: memberID of the first speech of this year flagged as
  # opposition spokesperson / finance minister (NA when there is none).
  thisrefOpp <- 
    docvars(budgetCorpus, "memberID")[which(docvars(budgetCorpus, "budget_year")==yr &
                                              docvars(budgetCorpus, "opposition_spokesperson"))][1]
  thisrefGovt <- 
    docvars(budgetCorpus, "memberID")[which(docvars(budgetCorpus, "budget_year")==yr &
                                              docvars(budgetCorpus, "finance_minister"))][1]
  # Keep only rows whose total feature count exceeds the threshold
  minWordsThreshold <- 200
  thisdfm <- thisdfm[rowSums(thisdfm) > minWordsThreshold, ]
  # Reference scores: NA for every row except the two reference speakers,
  # which are located by comparing the row names with their memberID.
  refscores <- rep(NA, ndoc(thisdfm))
  refscores[which(docnames(thisdfm) == as.character(thisrefOpp))] <- -1 # opposition
  refscores[which(docnames(thisdfm) == as.character(thisrefGovt))] <- 1 # govt
  # Fit the wordscores model and score all rows of the same matrix.
  # NOTE(review): rescaling="mv" presumably selects Martin-Vanberg rescaling;
  # confirm against the quanteda 0.8 documentation.
  ws <- textmodel(thisdfm, refscores, smooth=1, model="wordscores")
  ts <- predict(ws, thisdfm, rescaling="mv", verbose=FALSE)
  # Append this year's rescaled scores; the row names of thisdfm are
  # converted back to numeric memberIDs.
  textResults <- rbind(textResults, data.frame(budget_year = yr,
                                               memberID = as.numeric(docnames(thisdfm)),
                                               textscore = ts@textscores$textscore_mv))
}
cat("finished.\n")

# Write the objects created so far to disk: the analysis continues under a
# newer quanteda version (the original note refers to that step as "2.2").
# The objects are given by name and stored in this order.
save(
  list = c(
    "budgetCorpus", "d", "dupls", "duplslist", "fm.data", "multidept",
    "os.data", "textResults", "thisdfm", "ts", "ws", "yearspeakerData",
    "inMoreThanOneDept", "minWordsThreshold", "refscores", "thisrefGovt",
    "thisrefOpp", "varstokeep"
  ),
  file = "3_positions_before_senti.RData"
)
